This R code defines a function load.package() that checks if a specified package is already installed in the R environment. If the package is not installed, it installs it using the install.packages() function and then loads it into the current session using the library() function.
The function takes a single argument, package, which is a character string specifying the name of the package to be loaded.
The code then calls the load.package() function for each of the required packages for the analysis. These packages include dplyr, tidyr, tm, udpipe, SnowballC, vader, ggplot2, ggthemes, stringr, qdapTools, quanteda, quanteda.textstats, quantmod, and tibble.
By using this function, the code ensures that all the required packages are installed and loaded, without having to manually check each package and install it if necessary.
# Check if a package is installed and load it, otherwise install and load it
load.package <- function(package) {
  if (!require(package, character.only = TRUE)) {
    install.packages(package)
    library(package, character.only = TRUE)
  } else {
    library(package, character.only = TRUE)
  }
}
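As a side note, require() both tests for and attaches a package, so the else branch's library() call is redundant. A slightly leaner variant (a sketch, not part of the original notebook) would be:

```r
# Leaner sketch: require() already attaches the package when it is
# installed, so library() is only needed after a fresh install.
load.package <- function(package) {
  if (!require(package, character.only = TRUE)) {
    install.packages(package)
    library(package, character.only = TRUE)
  }
}
```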
# Load required packages
load.package("dplyr")
load.package("tidyr")
load.package("tm")
load.package("udpipe")
load.package("SnowballC")
load.package("vader")
load.package("ggplot2")
load.package("ggthemes")
load.package("stringr")
load.package("qdapTools")
load.package("quanteda")
load.package("quanteda.textstats")
load.package("quantmod")
load.package("tibble")
(Console output truncated: the usual package startup and masking messages, plus a warning that quanteda.textstats was not yet installed — it is installed on the fly together with its dependencies nsyllable and proxyC — and the xts warning that dplyr's lag() masks base R's lag().)
Data source: https://www.kaggle.com/datasets/vidyapb/elon-musk-tweets-2015-to-2020
tesla = read.csv('/kaggle/input/elon-musk-tweets-2015-to-2020/elonmusk.csv')
head(tesla, 5)
id | conversation_id | created_at | date | time | timezone | user_id | username | name | place | ⋯ | geo | source | user_rt_id | user_rt | retweet_id | reply_to | retweet_date | translate | trans_src | trans_dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <chr> | <chr> | <chr> | <int> | <chr> | <chr> | <lgl> | ⋯ | <lgl> | <lgl> | <lgl> | <lgl> | <lgl> | <chr> | <lgl> | <lgl> | <lgl> | <lgl> | |
1 | 1.282940e+18 | 1.282933e+18 | 1.594712e+12 | 2020-07-14 | 07:28:03 | UTC | 44196397 | elonmusk | Elon Musk | NA | ⋯ | NA | NA | NA | NA | NA | [{'user_id': '44196397', 'username': 'elonmusk'}, {'user_id': '1308211178', 'username': 'Teslarati'}] | NA | NA | NA | NA |
2 | 1.282845e+18 | 1.282802e+18 | 1.594689e+12 | 2020-07-14 | 01:10:26 | UTC | 44196397 | elonmusk | Elon Musk | NA | ⋯ | NA | NA | NA | NA | NA | [{'user_id': '44196397', 'username': 'elonmusk'}, {'user_id': '275295731', 'username': 'davidtayar5'}, {'user_id': '1046173159580479490', 'username': 'TeslaLisa'}, {'user_id': '999632398525530114', 'username': 'SteveHamel16'}, {'user_id': '1689516060', 'username': 'vincent13031925'}, {'user_id': '717042249624854529', 'username': 'S_Padival'}, {'user_id': '980554784133427200', 'username': 'BarkMSmeagol'}, {'user_id': '23771947', 'username': 'annerajb'}, {'user_id': '12153682', 'username': 'PJHORNAK'}, {'user_id': '2638809573', 'username': 'WPipperger'}, {'user_id': '316068880', 'username': 'EcoHeliGuy'}] | NA | NA | NA | NA |
3 | 1.282806e+18 | 1.282759e+18 | 1.594680e+12 | 2020-07-13 | 22:34:13 | UTC | 44196397 | elonmusk | Elon Musk | NA | ⋯ | NA | NA | NA | NA | NA | [{'user_id': '44196397', 'username': 'elonmusk'}, {'user_id': '319128454', 'username': 'katlinegrey'}] | NA | NA | NA | NA |
4 | 1.282800e+18 | 1.282672e+18 | 1.594678e+12 | 2020-07-13 | 22:12:52 | UTC | 44196397 | elonmusk | Elon Musk | NA | ⋯ | NA | NA | NA | NA | NA | [{'user_id': '44196397', 'username': 'elonmusk'}, {'user_id': '1689516060', 'username': 'vincent13031925'}] | NA | NA | NA | NA |
5 | 1.282800e+18 | 1.282739e+18 | 1.594678e+12 | 2020-07-13 | 22:12:26 | UTC | 44196397 | elonmusk | Elon Musk | NA | ⋯ | NA | NA | NA | NA | NA | [{'user_id': '44196397', 'username': 'elonmusk'}, {'user_id': '17217640', 'username': 'SpaceflightNow'}] | NA | NA | NA | NA |
nrow(tesla)
This code uses the dplyr package to perform data manipulation operations on the tesla data frame.
tesla %>% arrange(desc(date)) sorts the rows of the tesla data frame in descending order of date. Note that arrange() takes unquoted column names and has no ascending argument; the original call arrange("date", ascending = F) would sort by the constant string "date" and leave the row order unchanged, so desc(date) is the correct way to request descending order.
%>% group_by(date) groups the rows of the data frame by date, so that subsequent summary operations are computed once per unique date.
%>% summarize(tweet = paste(tweet, collapse = " . ")) collapses the tweet column within each group (i.e., each unique date) into a single string, with individual tweets separated by " . ".
tesla = tesla %>% arrange(desc(date)) %>%
  group_by(date) %>%
  summarize(tweet = paste(tweet, collapse = " . "))
tail(tesla, 5)
date | tweet |
---|---|
<chr> | <chr> |
2020-07-10 | I spoke with Korolev’s family today. He was one of the very best. Корольов / Королёв. . In general, we need to improve how podcasts play . SPQR https://m.youtube.com/watch?v=wjOfQfxmTLQ … . True haha . Death is the loss of information . Earning power post augmentation would easily pay for itself (if that’s even necessary). This is the best thing I can think of to ensure that collective human will decides the future. . Absolutely . 👀 . Probably a good one to design & engineer in Germany . Even more . Ok . 👀 . Berlin Model Y is the one to watch. That is a revolution in automotive body engineering (finally). . True . Wow, IHOP & GitHub are close . Best use of the term “Full Stack”? . For sure. This is both great & terrifying. Everything we’ve ever sensed or thought has been electrical signals. The early universe was just a soup of quarks & leptons. How did a very small piece of the Universe start to think of itself as sentient? . Yes . Sorry, should hopefully be soon! . Not actually a payout, just a vesting of stock options. It may never pay out, as the stock can’t be sold for 5 years after exercise. The stock must be bought & income taxes paid, then hold value for 5 years. . No easy way to answer this in a tweet, but helping with dire brain injuries is our first priority. Details Aug 28. . AI symbiosis while u wait |
2020-07-11 | Haha . Maybe he should design flag of Mars . 48 65 78 20 74 6f 20 74 65 78 74 . pic.twitter.com/1MQXFAKPzf |
2020-07-12 | Thanks :) . That is the near-term danger of AI . I didn’t mind DA2. DA1 was awesome. Mass Effect 2 (talking about sequels) is amazing. |
2020-07-13 | Reusability is essential. A rocket that is single use is just as absurd as a single use airplane. F9 engines already fire 3 times per flight. . Wild times! . We’re being extra paranoid. Maximizing probability of successful launch is paramount. . Welcome anytime . Well, I do care very much about sustainability of civilization, but there is some truth to the irony part haha . Yes, in plan. Superchargers and public high power wall connectors will keep growing exponentially every year. . 👀 . I think so . Doing range testing now. Number will be significantly higher than 300. Extremely good for any EV, especially an SUV. . We have reduced pricing on Model Y LR dual motor & will offer a LR single motor Y in a few months, which improves affordability, while still keeping the product excellent . Yes . It may be able to reach 250kW at low states of charge . No, as range would be unacceptably low (< 250 mile EPA) . We had to increase some wire thicknesses in S/X to reduce resistive heating. Technically, won’t be quite 1000 mph charging, as X especially is much bigger than 3. . Hell of a ride! |
2020-07-14 | Cute . Wow |
This R code creates a corpus object, which is a collection of documents (in this case, tweets about Tesla) that will be used for text analysis.
The Corpus() function is from the tm package, and is used to create a corpus object from a source text (in this case, VectorSource(tesla$tweet)). The VectorSource() function is used to create a source vector from the tweet column of the tesla data frame, which contains the text of the tweets.
The corpus[[1]][1] line prints the full text content of the first document in the corpus (not just its first word).
# Extract corpus
corpus = Corpus(VectorSource(tesla$tweet))
corpus[[1]][1]
This R code is using the tm_map() function from the tm package to transform the text in a corpus to lowercase.
# Convert to lower case
corpus = tm_map(corpus, FUN = content_transformer(tolower))
corpus[[1]][1]
Warning message in tm_map.SimpleCorpus(corpus, FUN = content_transformer(tolower)): “transformation drops documents”
The content_transformer() function is being used to apply a regular expression pattern to each document in the corpus. The regular expression pattern is 'http[[:alnum:][:punct:]]*', which matches any sequence of characters that starts with 'http', followed by any combination of alphanumeric and punctuation characters. This pattern is being replaced with a space character using the gsub() function, which is a built-in R function for replacing text in a string.
The overall effect of this code is to remove any URLs or web links from the text documents in the corpus, replacing them with a space character.
# Remove URLs
corpus = tm_map(corpus,
                FUN = content_transformer(FUN = function(x) gsub(pattern = 'http[[:alnum:][:punct:]]*',
                                                                 replacement = ' ', x = x)))
Warning message in tm_map.SimpleCorpus(corpus, FUN = content_transformer(FUN = function(x) gsub(pattern = "http[[:alnum:][:punct:]]*", : “transformation drops documents”
removePunctuation is a built-in function in the tm package that removes all punctuation marks from the text.
# Remove punctuation
corpus = tm_map(corpus,FUN = removePunctuation)
corpus[[1]][1]
Warning message in tm_map.SimpleCorpus(corpus, FUN = removePunctuation): “transformation drops documents”
The removeWords() function is applied to corpus using tm_map(). The removeWords() function is a text preprocessing function that takes a corpus as input and removes specified words from the text. In this case, stopwords('english') is used to generate a list of English stop words that are commonly used but do not carry much meaning on their own (such as "the", "and", "a", etc.). These stop words are then removed from the text in corpus.
The resulting corpus object will have the English stop words removed from its text.
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
corpus[[1]][1]
Warning message in tm_map.SimpleCorpus(corpus, FUN = removeWords, c(stopwords("english"))): “transformation drops documents”
The stripWhitespace function is passed as an argument to tm_map(). This built-in tm function collapses runs of whitespace (spaces, tabs, line breaks) in each document down to a single space.
Applying it to corpus therefore normalizes the spacing left behind by the earlier removal steps.
corpus = tm_map(corpus,FUN = stripWhitespace)
corpus[[1]][1]
Warning message in tm_map.SimpleCorpus(corpus, FUN = stripWhitespace): “transformation drops documents”
Lemmatization is the process of reducing a word to its base or dictionary form, which can help to standardize text and improve the accuracy of text analysis.
The code first downloads a pre-trained language model for English using the udpipe_download_model function, and then loads the model using udpipe_load_model.
Next, the code defines a function called lemmatize_document which takes a single document as input and returns a vector of lemmas. The function tokenizes the document using the udpipe_annotate function, extracts the lemma column from the resulting data frame using as.data.frame, and removes numbers, punctuation, and other special characters from the lemmas using the gsub function.
Finally, the code applies the lemmatize_document function to each document in the corpus using the lapply function and stores the result in lemmatized_corpus. The resulting corpus is then printed using head.
# download the model to perform the lemmatization
model <- udpipe_download_model(language = "english")
# load the model
ud_model <- udpipe_load_model(file = model$file_model)
# create a function to lemmatize a single document
lemmatize_document <- function(document) {
  # tokenize and annotate the document with the udpipe model
  tokens <- udpipe_annotate(ud_model, x = document)
  # extract the lemma column from the annotation
  lemmas <- as.data.frame(tokens)$lemma
  # remove numbers, punctuation and other special characters
  lemmas <- gsub("[^[:alpha:][:space:]]*", "", lemmas)
  # return the lemmatized document
  return(lemmas)
}
# apply lemmatization to each document in the corpus
lemmatized_corpus <- lapply(corpus, lemmatize_document)
# print the lemmatized corpus
head(lemmatized_corpus, 5)
(Output truncated: udpipe downloads the english-ewt-ud-2.5 model, trained on version 2.5 of the Universal Dependencies data and distributed under the CC-BY-SA-NC license, to /kaggle/working/english-ewt-ud-2.5-191206.udpipe. Two read_connlu() warnings follow — "No parsed data in x$conllu, returning default empty data.frame" — which udpipe emits when a document is empty after cleaning.)
This code performs stemming on a previously lemmatized corpus of text using the SnowballC package in R. Stemming is the process of reducing a word to its root or base form, which can help to reduce the size of the vocabulary and improve text analysis.
The code first defines a function called stem_document which takes a single document as input and returns a vector of stems. The function uses the wordStem function from the SnowballC package to stem the words in the document, with the language parameter set to "english" for English language text.
Next, the code applies the stem_document function to each document in the previously lemmatized corpus using the lapply function and stores the result in stemmed_corpus.
# create a function to stem a single document
stem_document <- function(document) {
  # stem the words in the document
  stems <- wordStem(document, language = "english")
  # return the stemmed document
  return(stems)
}
# apply stemming to each document in the corpus
stemmed_corpus <- lapply(lemmatized_corpus, stem_document)
# print the stemmed corpus
head(stemmed_corpus, 5)
This code removes non-alpha characters from a previously stemmed corpus of text using the gsub function in R. The purpose of this step is to clean the text further and remove any remaining non-alphabetic characters that may interfere with subsequent text analysis.
The code first defines a function called clean_document which takes a single document as input and returns a cleaned version of the document. The function uses gsub to remove all characters that are not alphabets or white spaces from the document, with the regular expression "[^[:alpha:][:space:]]*" matching any character that is not a letter or whitespace.
Next, the code applies the clean_document function to each document in the previously stemmed corpus using the lapply function and stores the result in cleaned_corpus.
# Removing non-alpha characters
# create a function to remove non-alpha characters from a document
clean_document <- function(document) {
  # remove all characters that are not letters or whitespace
  clean_doc <- gsub("[^[:alpha:][:space:]]*", "", document)
  # return the cleaned document
  return(clean_doc)
}
# apply cleaning to each document in the lemmatized or stemmed corpus
cleaned_corpus <- lapply(stemmed_corpus, clean_document)
# print the cleaned corpus
head(cleaned_corpus, 5)
This code builds a document-term matrix (DTM) from the cleaned corpus with the intent of applying term frequency-inverse document frequency (TF-IDF) weighting, a widely used scheme in text mining that weights each term by its frequency within a document and its rarity across all documents.
The DTM is created with the DocumentTermMatrix function from the tm package. Note, however, that normalize = TRUE is not a weighting option recognized by DocumentTermMatrix(), so the control list has no effect here and the matrix actually contains raw term counts (the integer values in the output below confirm this); true TF-IDF weighting would be requested through the weighting control option instead.
Next, removeSparseTerms() with sparse = 0.95 drops the sparsest terms: only terms that appear in at least 5% of the documents are kept. The resulting DTM is converted to a matrix and then to a data frame using as.matrix and as.data.frame, respectively.
# TF-IDF
dtm_tfidf = DocumentTermMatrix(x = cleaned_corpus,
                               control = list(normalize = TRUE))
xdtm_tfidf = removeSparseTerms(dtm_tfidf, sparse = 0.95)
xdtm_tfidf = as.data.frame(as.matrix(xdtm_tfidf))
sort(colSums(xdtm_tfidf), decreasing = T)
head(xdtm_tfidf, 10)
"can", | "test", | "", | "almost", | "give", | "go", | "good", | "launch", | "m", | "much", | ⋯ | "improv", | "make", | "engin", | "sound", | "earth", | "order", | "exact", | "part", | "pretti", | "starship", | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 4 | 1 | 1 | 1 | 1 | 3 | 1 | 1 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 2 | 0 | 1 | 0 | 3 | 1 | 1 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
tesla_tfidf = cbind(tesla, xdtm_tfidf)
head(tesla_tfidf, 3)
date | tweet | "can", | "test", | "", | "almost", | "give", | "go", | "good", | "launch", | ⋯ | "improv", | "make", | "engin", | "sound", | "earth", | "order", | "exact", | "part", | "pretti", | "starship", | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | 2015-01-30 | If you are curious about the P85D, you can schedule a test drive here: http://ts.la/dE | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2015-02-08 | Prob good though. Will give us time to replace 1st stage video transmitter (not needed for launch, but nice to have). . Air Force tracking radar went down. Launch postponed to same time tomorrow. . Rocket reentry will be much tougher this time around due to deep space mission. Almost 2X force and 4X heat. Plenty of hydraulic fluid tho. . Launching our 1st deep space mission today. Headed to Earth-Sun L1 gravity null point at 1M miles, 4X further than moon. | 0 | 0 | 4 | 1 | 1 | 1 | 1 | 3 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 2015-02-10 | Launch postponed to tomorrow due to high winds at the Cape, but Dragon still inbound from orbit in 90 mins . Extreme wind shear over Cape Canaveral. Feels like a sledgehammer when supersonic in the vertical. Hoping it changes … . "What Are The Civilian Applications?" https://m.youtube.com/watch?v=M8YjvHYbZ9w … | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
This code performs sentiment analysis on a collection of tweets related to Tesla using the VADER (Valence Aware Dictionary and sEntiment Reasoner) sentiment analysis tool. The code loops through each tweet in the "tesla" data frame and applies the get_vader function to extract the sentiment scores.
Each score is then appended to its own list: "word_scores", "compound", "pos", "neu", "neg", and "but_count". The length() function is used to find the next free index in each list.
After processing all tweets, the code combines the lists into a data frame named vader_df using the data.frame() function. The unlist() function is used to flatten the lists into a single vector for each score type. The resulting data frame contains one row per tweet and columns for each score type.
# Create empty lists
word_scores_list = list()
compound_list = list()
pos_list = list()
neu_list = list()
neg_list = list()
but_count_list = list()
# Loop through tweets
for (row in tesla$tweet) {
  # Get sentiment scores
  sentiment = get_vader(row, incl_nt = T, neu_set = T, rm_qm = T)
  # Append scores to lists
  word_scores_list[[length(word_scores_list) + 1]] = sentiment["word_scores"]
  compound_list[[length(compound_list) + 1]] = sentiment["compound"]
  pos_list[[length(pos_list) + 1]] = sentiment["pos"]
  neu_list[[length(neu_list) + 1]] = sentiment["neu"]
  neg_list[[length(neg_list) + 1]] = sentiment["neg"]
  but_count_list[[length(but_count_list) + 1]] = sentiment["but_count"]
}
# Combine lists into data frame
vader_df = data.frame(
  word_scores = unlist(word_scores_list),
  compound = unlist(compound_list),
  pos = unlist(pos_list),
  neu = unlist(neu_list),
  neg = unlist(neg_list),
  but_count = unlist(but_count_list)
)
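As an aside, the vader package also exports vader_df(), which scores a whole character vector in one call and returns the same columns, avoiding the manual list bookkeeping above. A hedged sketch (check your installed version's documentation before relying on the exact signature; the notebook reuses the name vader_df for its own data frame, hence the explicit package prefix):

```r
library(vader)

# Small illustrative input; in the notebook this would be tesla$tweet
texts <- c("Launch was a great success!", "Rocket landing failed.")

# vader_df() scores each element and returns a data frame with columns
# text, word_scores, compound, pos, neu, neg, but_count
scores <- vader::vader_df(texts)
scores$compound
```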
colnames(vader_df)
tesla_sentiment = cbind(tesla_tfidf, vader_df)
print(nrow(tesla_sentiment))
head(tesla_sentiment, 5)
[1] 1360
date | tweet | "can", | "test", | "", | "almost", | "give", | "go", | "good", | "launch", | ⋯ | "exact", | "part", | "pretti", | "starship", | word_scores | compound | pos | neu | neg | but_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | 2015-01-30 | If you are curious about the P85D, you can schedule a test drive here: http://ts.la/dE | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | {0, 0, 0, 1.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} | 0.318 | 0.141 | 0.859 | 0 | 0 |
2 | 2015-02-08 | Prob good though. Will give us time to replace 1st stage video transmitter (not needed for launch, but nice to have). . Air Force tracking radar went down. Launch postponed to same time tomorrow. . Rocket reentry will be much tougher this time around due to deep space mission. Almost 2X force and 4X heat. Plenty of hydraulic fluid tho. . Launching our 1st deep space mission today. Headed to Earth-Sun L1 gravity null point at 1M miles, 4X further than moon. | 0 | 0 | 4 | 1 | 1 | 1 | 1 | 3 | ⋯ | 0 | 0 | 0 | 0 | {0, 0.95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} | 0.67 | 0.088 | 0.887 | 0.025 | 1 |
3 | 2015-02-10 | Launch postponed to tomorrow due to high winds at the Cape, but Dragon still inbound from orbit in 90 mins . Extreme wind shear over Cape Canaveral. Feels like a sledgehammer when supersonic in the vertical. Hoping it changes … . "What Are The Civilian Applications?" https://m.youtube.com/watch?v=M8YjvHYbZ9w … | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | {0, -0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.25, 0, 0, 0, 0, 0, 0, 0, 2.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} | 0.785 | 0.139 | 0.835 | 0.027 | 1 |
4 | 2015-02-11 | Rocket soft landed in the ocean within 10m of target & nicely vertical! High probability of good droneship landing in non-stormy weather. . Primary mission on target. Spacecraft head towards the sun! All good there. . @DanielLockyer We could actually do that...maybe we should . Planning a significant upgrade of the droneship for future missions to handle literally anything. Maybe give it a Merlin for good measure :) . Can't delay any longer. Must proceed with primary mission to launch the Deep Space Climate Observatory spacecraft. . Mega storm preventing droneship from remaining on station, so rocket will try to land on water. Survival probability <1%. . Coming home pic.twitter.com/FmrmYs6R6V . Dragon splashdown off the California coast pic.twitter.com/4Bvfmei8I3 | 0 | 0 | 2 | 0 | 1 | 0 | 3 | 1 | ⋯ | 0 | 0 | 0 | 0 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.9, 0, 0, 0, 0, 1.9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.9, 0, 2, 0, 0, 0.962, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} | 0.95 | 0.146 | 0.846 | 0.008 | 0 |
5 | 2015-02-12 | Landing on a stormy sea pic.twitter.com/7EY25g3IU5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | {0, 0, 0, 0, 0, 0} | 0 | 0 | 1 | 0 | 0 |
write.csv(tesla_sentiment, 'pre_finance.csv')
This code is for downloading stock price data for Tesla from Yahoo Finance within a specified date range.
First, the stock symbol "TSLA" is defined and the start and end dates are specified using as.Date().
getSymbols() is then used to download the historical stock price data for TSLA from Yahoo Finance, with the data source specified as "yahoo" and the date range specified using the from and to arguments.
Next, Ad() is used to extract the adjusted close price data from the downloaded data. as.data.frame() is used to convert the data into a dataframe format, and rownames_to_column() is used to move the date information from the row names to a separate "date" column in the dataframe. The resulting dataframe is assigned to tsla_prices.
# define the stock symbol and data range
symbol <- "TSLA"
start_date <- as.Date("2015-01-29")
end_date <- as.Date("2020-07-15")
# get the stock price data from Yahoo Finance
getSymbols(symbol, src = "yahoo", from = start_date, to = end_date)
# extract the adjusted close price data
tsla_prices <- Ad(get(symbol))
tsla_prices <- as.data.frame(tsla_prices) %>%
rownames_to_column(var = "date")
# view the last few prices
tail(tsla_prices)
date | TSLA.Adjusted | |
---|---|---|
<chr> | <dbl> | |
1369 | 2020-07-07 | 92.65733 |
1370 | 2020-07-08 | 91.05867 |
1371 | 2020-07-09 | 92.95200 |
1372 | 2020-07-10 | 102.97667 |
1373 | 2020-07-13 | 99.80400 |
1374 | 2020-07-14 | 101.12000 |
This code converts the "date" column in the tesla_sentiment and tsla_prices dataframes from character or factor type to Date type using as.Date().
This is useful when combining or merging dataframes on the basis of the "date" column, as it ensures that the dates are in the same format and can be compared properly.
tesla_sentiment$date = as.Date(tesla_sentiment$date)
tsla_prices$date = as.Date(tsla_prices$date)
tail(tesla_sentiment, 3)
date | tweet | "can", | "test", | "", | "almost", | "give", | "go", | "good", | "launch", | ⋯ | "exact", | "part", | "pretti", | "starship", | word_scores | compound | pos | neu | neg | but_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<date> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1358 | 2020-07-12 | Thanks :) . That is the near-term danger of AI . I didn’t mind DA2. DA1 was awesome. Mass Effect 2 (talking about sequels) is amazing. | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | {1.9, 2, 0, 0, 0, 0, 0, -2.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.1, 0, 0, 0, 0, 0, 0, 0, 2.8} | 0.886 | 0.361 | 0.55 | 0.089 | 0 |
1359 | 2020-07-13 | Reusability is essential. A rocket that is single use is just as absurd as a single use airplane. F9 engines already fire 3 times per flight. . Wild times! . We’re being extra paranoid. Maximizing probability of successful launch is paramount. . Welcome anytime . Well, I do care very much about sustainability of civilization, but there is some truth to the irony part haha . Yes, in plan. Superchargers and public high power wall connectors will keep growing exponentially every year. . 👀 . I think so . Doing range testing now. Number will be significantly higher than 300. Extremely good for any EV, especially an SUV. . We have reduced pricing on Model Y LR dual motor & will offer a LR single motor Y in a few months, which improves affordability, while still keeping the product excellent . Yes . It may be able to reach 250kW at low states of charge . No, as range would be unacceptably low (< 250 mile EPA) . We had to increase some wire thicknesses in S/X to reduce resistive heating. Technically, won’t be quite 1000 mph charging, as X especially is much bigger than 3. . Hell of a ride! | 0 | 1 | 7 | 0 | 0 | 0 | 1 | 1 | ⋯ | 0 | 1 | 0 | 0 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5, 0, 0, 0, 1.4, 0, 0, 0, 0, 1, 0, 0, 0.55, 0, 0, 1.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.95, 0, 0, -0.3, 0, 3, 0, 2.55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.45, 0, 0, 0, 0, 0, 0, 0, 3.2895, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.7, 0, 0, 0, 0, 0, 0, 4.05, 0, 2.55, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, -1.65, 0, 0, 0, 0, -1.8, 0, 0, 0, 0, 0, -1.65, 0, 0, 0, 0, 0, 0, 0, 0, 1.95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -5.4, 0, 0, 0} | 0.973 | 0.18 | 0.742 | 0.079 | 1 |
1360 | 2020-07-14 | Cute . Wow | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | {2, 0, 2.8} | 0.778 | 0.872 | 0.128 | 0 | 0 |
colnames(tesla_sentiment)
This code performs the following tasks:
It merges the tesla_sentiment and tsla_prices data frames on the "date" column using the merge() function. The argument all.y = TRUE keeps every row of tsla_prices in the merged data frame.
It adds a new column called "increase" to the merged data frame, containing binary values that indicate whether the adjusted stock price rose relative to the previous trading day (1) or not (0). This is our target variable.
It replaces the NA values in columns 3 through ncol(merged_df) - 1 (every column except "date", "tweet", and "increase") with 0, using is.na() inside a data-frame subassignment. Note that `:` binds more tightly than `-` in R, so the range must be written 3:(ncol(merged_df) - 1).
merged_df = merge(tesla_sentiment, tsla_prices, by="date", all.y = TRUE)
# Add increase column
merged_df$increase <- c(0, ifelse(diff(merged_df$TSLA.Adjusted) > 0, 1, 0))
# Replace NA values with 0; note the parentheses, since ':' binds tighter than '-'
merged_df[, 3:(ncol(merged_df) - 1)][is.na(merged_df[, 3:(ncol(merged_df) - 1)])] = 0
tail(merged_df, 2)
date | tweet | "can", | "test", | "", | "almost", | "give", | "go", | "good", | "launch", | ⋯ | "pretti", | "starship", | word_scores | compound | pos | neu | neg | but_count | TSLA.Adjusted | increase | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<date> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <dbl> | <dbl> | |
1373 | 2020-07-13 | Reusability is essential. A rocket that is single use is just as absurd as a single use airplane. F9 engines already fire 3 times per flight. . Wild times! . We’re being extra paranoid. Maximizing probability of successful launch is paramount. . Welcome anytime . Well, I do care very much about sustainability of civilization, but there is some truth to the irony part haha . Yes, in plan. Superchargers and public high power wall connectors will keep growing exponentially every year. . 👀 . I think so . Doing range testing now. Number will be significantly higher than 300. Extremely good for any EV, especially an SUV. . We have reduced pricing on Model Y LR dual motor & will offer a LR single motor Y in a few months, which improves affordability, while still keeping the product excellent . Yes . It may be able to reach 250kW at low states of charge . No, as range would be unacceptably low (< 250 mile EPA) . We had to increase some wire thicknesses in S/X to reduce resistive heating. Technically, won’t be quite 1000 mph charging, as X especially is much bigger than 3. . Hell of a ride! | 0 | 1 | 7 | 0 | 0 | 0 | 1 | 1 | ⋯ | 0 | 0 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5, 0, 0, 0, 1.4, 0, 0, 0, 0, 1, 0, 0, 0.55, 0, 0, 1.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.95, 0, 0, -0.3, 0, 3, 0, 2.55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.45, 0, 0, 0, 0, 0, 0, 0, 3.2895, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.7, 0, 0, 0, 0, 0, 0, 4.05, 0, 2.55, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, -1.65, 0, 0, 0, 0, -1.8, 0, 0, 0, 0, 0, -1.65, 0, 0, 0, 0, 0, 0, 0, 0, 1.95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -5.4, 0, 0, 0} | 0.973 | 0.18 | 0.742 | 0.079 | 1 | 99.804 | 0 |
1374 | 2020-07-14 | Cute . Wow | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | {2, 0, 2.8} | 0.778 | 0.872 | 0.128 | 0 | 0 | 101.120 | 1 |
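One indexing subtlety worth a quick check: in R, `:` binds more tightly than `-`, so an unparenthesized `3:ncol(merged_df)-1` would select columns 2 through ncol-1 rather than 3 through ncol-1. A minimal sketch:

```r
n <- 6
3:n - 1    # evaluates as (3:n) - 1, i.e. 2 3 4 5
3:(n - 1)  # the usually intended range:  3 4 5
```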
write.csv(merged_df, file="tesla_sentiment.csv", row.names = F)
tesla_sentiment = read.csv('/kaggle/input/pre-data-visualization/tesla_sentiment.csv')
head(tesla_sentiment, 2)
date | tweet | X.can.. | X.test.. | X... | X.almost.. | X.give.. | X.go.. | X.good.. | X.launch.. | ⋯ | X.pretti.. | X.starship.. | word_scores | compound | pos | neu | neg | but_count | TSLA.Adjusted | increase | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <dbl> | <int> | |
1 | 2015-01-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 13.68000 | 0 |
2 | 2015-01-30 | If you are curious about the P85D, you can schedule a test drive here: http://ts.la/dE | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | {0, 0, 0, 1.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} | 0.318 | 0.141 | 0.859 | 0 | 0 | 13.57333 | 0 |
This code block cleans the column names of the tesla_sentiment data frame. Here are the details for each line:
The first line uses a regular expression to strip the 'X.' prefix and '..' suffix that read.csv() added to the word-count column names, storing the cleaned names in a variable called new_col_names. The second line assigns the new names back to tesla_sentiment using the colnames function. The third line drops the leftover columns "", "m", "x", ".1", and "s" using the negation operator ! with the %in% function, and assigns the resulting data frame to a new variable called final_data_new. The head function is then used to display the first five rows of the cleaned data frame.
# Remove 'X.' and '..' from column names
new_col_names = gsub('X\\.|\\.\\.', '', colnames(tesla_sentiment))
# Assign new column names to the data frame
colnames(tesla_sentiment) <- new_col_names
# Drop columns "", "m", "x", ".1", and "s"
final_data_new <- tesla_sentiment[, !names(tesla_sentiment) %in% c('', 'm', 'x', '.1', 's')]
head(final_data_new, 5)
date | tweet | can | test | almost | give | go | good | launch | much | ⋯ | pretti | starship | word_scores | compound | pos | neu | neg | but_count | TSLA.Adjusted | increase | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <dbl> | <int> | |
1 | 2015-01-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 13.68000 | 0 |
2 | 2015-01-30 | If you are curious about the P85D, you can schedule a test drive here: http://ts.la/dE | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | {0, 0, 0, 1.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} | 0.318 | 0.141 | 0.859 | 0 | 0 | 13.57333 | 0 |
3 | 2015-02-02 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 14.06267 | 1 |
4 | 2015-02-03 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 14.55733 | 1 |
5 | 2015-02-04 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 14.57000 | 1 |
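The mangled names come from read.csv(), which by default runs make.names() on the header; the gsub() call simply strips the characters that substitution introduced. A small sketch with a few assumed header values:

```r
# a few header values as they appear after read.csv() (assumed examples)
raw_names <- c("X.can..", "X.test..", "date", "word_scores")
# remove the 'X.' prefix and '..' suffix in one pass
gsub('X\\.|\\.\\.', '', raw_names)
# "can" "test" "date" "word_scores"
```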
This code creates a bar chart of the "increase" column in the final_data_new data frame. The "increase" column contains binary values (0 or 1) that represent whether the stock price increased on a particular day.
The code uses the ggplot2 library. The "data" parameter specifies the data frame to use, and the "aes" function maps the "increase" column to the x-axis.
The "geom_bar" function draws one bar per class, with the "fill" parameter setting the fill color of the bars.
The "theme_bw" function sets the theme of the plot to a black and white style.
The "xlab" function sets the label for the x-axis.
Finally, the "coord_flip" function swaps the x and y axes of the plot. This makes the plot horizontal, with the bars running from left to right rather than from bottom to top.
#visualizing increase vs decrease
ggplot(data=final_data_new,aes(x=increase))+
geom_bar(fill=c('orange', 'brown'))+
theme_bw()+
xlab('Stock Decrease (0) or Increase (1)') +
coord_flip()
final_data_new_visual <- final_data_new[, !names(final_data_new) %in% c('word_scores', 'tweet','date','compound', 'pos', 'neu', 'neg', 'but_count', 'TSLA.Adjusted', 'increase')]
head(final_data_new_visual, 3)
can | test | almost | give | go | good | launch | much | need | point | ⋯ | improv | make | engin | sound | earth | order | exact | part | pretti | starship | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
This code visualizes the top 25 words appearing in Elon Musk's tweets.
col_sums calculates the sum of each column in final_data_new_visual, which contains the frequency of each word used in the tweets.
final_data_new_visual_sums creates a new data frame containing two columns: word and value, where word is the name of the word and value is the sum of its frequency in the tweets.
final_data_new_visual_sums_top25 creates a subset of final_data_new_visual_sums containing only the top 25 words sorted by their frequency in descending order.
Finally, ggplot is used to create a bar plot with the words on one axis and their summed frequencies on the other. The reorder function orders the words by value (ascending), so that after coord_flip the most frequent word sits at the top. The theme function rotates the axis labels by 90 degrees, coord_flip switches the axes of the plot, and the bars are filled with the color "sienna3".
#visualizing top 25 words appearing in Elon's Tweet
col_sums <- colSums(final_data_new_visual)
final_data_new_visual_sums <- data.frame(
word = names(col_sums),
value = col_sums
)
final_data_new_visual_sums_top25 <- final_data_new_visual_sums[order(final_data_new_visual_sums$value, decreasing = TRUE),][1:25,]
# visualize the results with a bar plot
ggplot(data = final_data_new_visual_sums_top25, aes(x = reorder(word, value), y = value)) +
geom_bar(stat = "identity", fill = "sienna3") +
labs(x = "Word", y = "Sum of Values") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip()
This code performs some date manipulations and summarizes the data on a yearly basis.
The lubridate library is loaded, which provides some useful functions for working with dates. Then, the as.Date() function is used to convert the date column of the final_data_new data frame to a date format. The quarter() and year() functions from lubridate are then used to create new columns quarter and year, which extract the quarter and year information from the date column.
Next, the %>% operator from the dplyr package is used to pipe the final_data_new data frame to a series of operations. The mutate() function creates a new column year, which extracts the year information from the date column. The data is then grouped by year using the group_by() function, and the summarise() function is used to calculate the mean values of the pos, neu, compound, neg, and TSLA.Adjusted columns for each year. The resulting data frame df_yearly contains the summarized data.
library(lubridate)
final_data_new$date <- as.Date(final_data_new$date, format = "%Y-%m-%d")
final_data_new$quarter <- quarter(final_data_new$date)
final_data_new$year <- year(final_data_new$date)
df_yearly <- final_data_new %>%
mutate(year = year(date)) %>%
group_by(year) %>%
summarise(pos = mean(pos), neu = mean(neu), compound = mean(compound), neg = mean(neg), TSLA.Adjusted = mean(TSLA.Adjusted))
head(final_data_new, 5)
date | tweet | can | test | almost | give | go | good | launch | much | ⋯ | starship | word_scores | compound | pos | neu | neg | but_count | TSLA.Adjusted | increase | quarter | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<date> | <chr> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <dbl> | <int> | <int> | |
1 | 2015-01-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 13.68000 | 0 | 1 |
2 | 2015-01-30 | If you are curious about the P85D, you can schedule a test drive here: http://ts.la/dE | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | {0, 0, 0, 1.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} | 0.318 | 0.141 | 0.859 | 0 | 0 | 13.57333 | 0 | 1 |
3 | 2015-02-02 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 14.06267 | 1 | 1 |
4 | 2015-02-03 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 14.55733 | 1 | 1 |
5 | 2015-02-04 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 14.57000 | 1 | 1 |
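The lubridate accessors used above work on a single date just as they do on a whole column; for example:

```r
library(lubridate)

d <- as.Date("2020-07-13")
year(d)     # 2020
quarter(d)  # 3  (July falls in the third quarter)
```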
This code creates two plots side by side, showing the sentiment scores over time and the TSLA adjusted stock prices over time.
The first plot is created using ggplot and shows four lines, each representing a different sentiment score (positive, neutral, compound, and negative) over the years in the dataset. The x-axis represents the year, and the y-axis represents the sentiment score. The color of each line is manually set using scale_color_manual, and the plot is given a title and a y-axis label.
The second plot is also created using ggplot and shows a single line representing the TSLA adjusted stock price over the years in the dataset. The x-axis and y-axis represent the year and the adjusted stock price, respectively. The plot is given a title and a y-axis label.
The two plots are combined side by side using the grid.arrange() function from the gridExtra package. The ncol argument specifies the number of columns in the resulting grid, and the widths argument specifies the relative widths of the columns.
#Sentiment Score over time vs Stock Price
# Create the first plot for sentiment scores
plot1 <- ggplot(data = df_yearly, aes(x = year)) +
geom_line(aes(y = pos, color = "Positive")) +
geom_line(aes(y = neu, color = "Neutral")) +
geom_line(aes(y = compound, color = "Compound")) +
geom_line(aes(y = neg, color = "Negative")) +
ylab("Sentiment Score") +
ggtitle("Sentiment Scores Over Time") +
scale_color_manual(values = c("Positive" = "blue", "Neutral" = "grey",
"Compound" = "black", "Negative" = "red"))
# Create the second plot for TSLA adjusted stock prices
plot2 <- ggplot(data = df_yearly, aes(x = year)) +
geom_line(aes(y = TSLA.Adjusted)) +
ylab("Adjusted Stock Price") +
ggtitle("TSLA Adjusted Stock Prices Over Time")
# Combine the two plots side by side using grid.arrange() from the gridExtra package
library(gridExtra)
grid.arrange(plot1, plot2, ncol = 2,widths=c(2/3, 1/3))
This code visualizes the top positive and negative words in the final_data_new_visual_sums data frame by joining it with the Bing sentiment lexicon using the inner_join() function from the dplyr package. The resulting data set is grouped by sentiment.
After that, the code splits the data into positive and negative subsets, orders each in descending order by value, selects the top 25 rows of each, and creates bar plots using the ggplot2 package. Finally, the two plots are combined side by side using the grid.arrange() function from the gridExtra package.
#Visualizing top positive words and top negative words
library(tidytext)
bing <- read.csv("/kaggle/input/bing-lexicon/bing-2.csv")
bing_sentiment <- final_data_new_visual_sums%>%
inner_join(bing)%>%
group_by(sentiment)
# filter to only include positive and negative sentiment
bing_sentiment_pos <- bing_sentiment[bing_sentiment$sentiment == "positive", ]
bing_sentiment_neg <- bing_sentiment[bing_sentiment$sentiment == "negative", ]
# arrange in descending order by value
bing_sentiment_pos <- bing_sentiment_pos[order(-bing_sentiment_pos$value), ]
bing_sentiment_neg <- bing_sentiment_neg[order(-bing_sentiment_neg$value), ]
# select top 25 rows
bing_sentiment_pos_top25 <- head(bing_sentiment_pos, 25)
bing_sentiment_neg_top25 <- head(bing_sentiment_neg, 25)
# Define color palette
my_colors <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#999999")
# Create bar plot for positive words with custom colors
plot_pos <- ggplot(bing_sentiment_pos_top25, aes(x = reorder(word, value), y = value, fill = word)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = my_colors[1:length(unique(bing_sentiment_pos_top25$word))]) +
labs(title = "Top 25 Positive Words", x = "Word", y = "Value") +
coord_flip()
# Create bar plot for negative words with custom colors
plot_neg <- ggplot(bing_sentiment_neg_top25, aes(x = reorder(word, value), y = value, fill = word)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = my_colors[1:length(unique(bing_sentiment_neg_top25$word))]) +
labs(title = "Top 25 Negative Words", x = "Word", y = "Value")
# Combine the two plots side by side using grid.arrange() from the gridExtra package
library(gridExtra)
grid.arrange(plot_pos, plot_neg, ncol = 2, widths=c(1/2, 1/2))
Joining with `by = join_by(word)`
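The effect of the inner join can be seen on a toy example (the lexicon rows below are hypothetical stand-ins for the Bing file):

```r
library(dplyr)

counts  <- data.frame(word = c("good", "bad", "rocket"), value = c(5, 2, 7))
lexicon <- data.frame(word = c("good", "bad"), sentiment = c("positive", "negative"))

inner_join(counts, lexicon, by = "word")
# 'rocket' is dropped: inner_join keeps only words present in both tables
```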
This code creates a new column in the final_data_new data frame called emojis. The str_extract_all() function from the stringr package extracts every emoji matching the pattern from the tweet column, returning a list-column with one character vector per row.
final_data_new$emojis <- str_extract_all(final_data_new$tweet, "[\U0001F300-\U0001F64F]|\U0001F680|\U0001F6A5|\U0001F6B2|\U0001F30D|\U0001F30E|\U0001F31E|\U0001F363|\U0001F377|\U0001F37B|\U0001F41F|\U0001F42C|\U0001F355|\U0001F419|\U0001F680|[\u2600-\u26FF]")
tail(final_data_new, 2)
date | tweet | can | test | almost | give | go | good | launch | much | ⋯ | word_scores | compound | pos | neu | neg | but_count | TSLA.Adjusted | increase | quarter | emojis | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<date> | <chr> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <dbl> | <int> | <int> | <list> | |
1373 | 2020-07-13 | Reusability is essential. A rocket that is single use is just as absurd as a single use airplane. F9 engines already fire 3 times per flight. . Wild times! . We’re being extra paranoid. Maximizing probability of successful launch is paramount. . Welcome anytime . Well, I do care very much about sustainability of civilization, but there is some truth to the irony part haha . Yes, in plan. Superchargers and public high power wall connectors will keep growing exponentially every year. . 👀 . I think so . Doing range testing now. Number will be significantly higher than 300. Extremely good for any EV, especially an SUV. . We have reduced pricing on Model Y LR dual motor & will offer a LR single motor Y in a few months, which improves affordability, while still keeping the product excellent . Yes . It may be able to reach 250kW at low states of charge . No, as range would be unacceptably low (< 250 mile EPA) . We had to increase some wire thicknesses in S/X to reduce resistive heating. Technically, won’t be quite 1000 mph charging, as X especially is much bigger than 3. . Hell of a ride! | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | ⋯ | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5, 0, 0, 0, 1.4, 0, 0, 0, 0, 1, 0, 0, 0.55, 0, 0, 1.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.95, 0, 0, -0.3, 0, 3, 0, 2.55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.45, 0, 0, 0, 0, 0, 0, 0, 3.2895, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.7, 0, 0, 0, 0, 0, 0, 4.05, 0, 2.55, 0, 0, 0, 0, 0, 0, 0.15, 0, 0, -1.65, 0, 0, 0, 0, -1.8, 0, 0, 0, 0, 0, -1.65, 0, 0, 0, 0, 0, 0, 0, 0, 1.95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -5.4, 0, 0, 0} | 0.973 | 0.180 | 0.742 | 0.079 | 1 | 99.804 | 0 | 3 | 👀 |
1374 | 2020-07-14 | Cute . Wow | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | {2, 0, 2.8} | 0.778 | 0.872 | 0.128 | 0.000 | 0 | 101.120 | 1 | 3 |
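str_extract_all() returns a list with one character vector per input string, which is why the emojis column is a list-column. For instance, using just the first character class of the pattern:

```r
library(stringr)

str_extract_all("Wild times! 👀 🔥🔥", "[\U0001F300-\U0001F64F]")
# [[1]] "👀" "🔥" "🔥"
```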
This code adds a new column called n_emojis to the final_data_new data frame. It does so by using the sapply() function to apply a function called len_emojis() to each element of the emojis list-column.
The len_emojis() function takes one element of that list (the vector of emojis extracted from a day's tweets) and returns its length, i.e. the number of emojis.
Before counting, the code replaces any missing values in the emojis column with 0 using the replace_na() function from the tidyr package, so that every element has a well-defined length.
final_data_new$emojis = replace_na(final_data_new$emojis, 0)
len_emojis = function(col){
n_emojis = length(col)
return(n_emojis)
}
final_data_new$n_emojis = sapply(final_data_new$emojis, FUN=len_emojis)
tail(final_data_new)
date | tweet | can | test | almost | give | go | good | launch | much | ⋯ | compound | pos | neu | neg | but_count | TSLA.Adjusted | increase | quarter | emojis | n_emojis | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<date> | <chr> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <dbl> | <int> | <int> | <list> | <int> | |
1369 | 2020-07-07 | If you get past Mars, the asteroids, moons of Jupiter & Saturn, inevitably you reach Uranus! . 🖤✨Carl Sagan ✨🖤 . Essentially. Long-term purpose of my Tesla stock is to help make life multiplanetary to ensure it’s continuance. The massive capital needs are in 10 to 20 years. By then, if we’re fortunate, Tesla’s goal of accelerating sustainable energy & autonomy will be mostly accomplished. | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0.912 | 0.198 | 0.802 | 0.000 | 0 | 92.65733 | 1 | 3 | 🖤, 🖤 | 2 |
1370 | 2020-07-08 | Tesla China team is awesome! . Words are a very lossy compression of thought | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0.440 | 0.233 | 0.635 | 0.132 | 0 | 91.05867 | 0 | 3 | 0 | |
1371 | 2020-07-09 | There’s some of that too . True, it sounds so surreal, but the negative propaganda is still all out there & easy to find in social media & press interviews, so it’s not just our imagination! . Make sure to read ur terms & conditions before clicking accept! . Samwise Gamgee . Altho Dumb and Dumber is 🔥🔥 . Progress update August 28 . Sure . If you can’t beat em, join em Neuralink mission statement | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0.708 | 0.216 | 0.635 | 0.149 | 1 | 92.95200 | 1 | 3 | 🔥, 🔥 | 2 |
1372 | 2020-07-10 | I spoke with Korolev’s family today. He was one of the very best. Корольов / Королёв. . In general, we need to improve how podcasts play . SPQR https://m.youtube.com/watch?v=wjOfQfxmTLQ … . True haha . Death is the loss of information . Earning power post augmentation would easily pay for itself (if that’s even necessary). This is the best thing I can think of to ensure that collective human will decides the future. . Absolutely . 👀 . Probably a good one to design & engineer in Germany . Even more . Ok . 👀 . Berlin Model Y is the one to watch. That is a revolution in automotive body engineering (finally). . True . Wow, IHOP & GitHub are close . Best use of the term “Full Stack”? . For sure. This is both great & terrifying. Everything we’ve ever sensed or thought has been electrical signals. The early universe was just a soup of quarks & leptons. How did a very small piece of the Universe start to think of itself as sentient? . Yes . Sorry, should hopefully be soon! . Not actually a payout, just a vesting of stock options. It may never pay out, as the stock can’t be sold for 5 years after exercise. The stock must be bought & income taxes paid, then hold value for 5 years. . No easy way to answer this in a tweet, but helping with dire brain injuries is our first priority. Details Aug 28. . AI symbiosis while u wait | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ⋯ | 0.964 | 0.147 | 0.802 | 0.051 | 1 | 102.97667 | 1 | 3 | 👀, 👀 | 2 |
1373 | 2020-07-13 | Reusability is essential. A rocket that is single use is just as absurd as a single use airplane. F9 engines already fire 3 times per flight. . Wild times! . We’re being extra paranoid. Maximizing probability of successful launch is paramount. . Welcome anytime . Well, I do care very much about sustainability of civilization, but there is some truth to the irony part haha . Yes, in plan. Superchargers and public high power wall connectors will keep growing exponentially every year. . 👀 . I think so . Doing range testing now. Number will be significantly higher than 300. Extremely good for any EV, especially an SUV. . We have reduced pricing on Model Y LR dual motor & will offer a LR single motor Y in a few months, which improves affordability, while still keeping the product excellent . Yes . It may be able to reach 250kW at low states of charge . No, as range would be unacceptably low (< 250 mile EPA) . We had to increase some wire thicknesses in S/X to reduce resistive heating. Technically, won’t be quite 1000 mph charging, as X especially is much bigger than 3. . Hell of a ride! | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | ⋯ | 0.973 | 0.180 | 0.742 | 0.079 | 1 | 99.80400 | 0 | 3 | 👀 | 1 |
1374 | 2020-07-14 | Cute . Wow | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0.778 | 0.872 | 0.128 | 0.000 | 0 | 101.12000 | 1 | 3 | 0 |
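Counting emojis per day then reduces to taking the length of each list element, e.g.:

```r
# toy list-column: two emojis, none, one
emoji_lists <- list(c("👀", "🔥"), character(0), "🚀")
sapply(emoji_lists, length)
# 2 0 1
```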
# Correlation between number of emojis used and stock price
cor(final_data_new$TSLA.Adjusted, final_data_new$n_emojis)
This code creates a table of counts for each emoji present in the final_data_new data set using the mtabulate function from the qdapTools package.
The resulting table is assigned to emoji_table, and the column names are modified to add an "emoji_" prefix to each column name using the paste0 function.
Finally, d is created by binding emoji_table and the TSLA.Adjusted column of final_data_new together using the cbind function. This new data set d pairs each day's emoji counts with the corresponding TSLA stock price.
emoji_table <- mtabulate(final_data_new$emojis)
colnames(emoji_table) <- paste0("emoji_", colnames(emoji_table))
d = cbind(emoji_table, final_data_new$TSLA.Adjusted)
tail(d, 20)
emoji_🌃 | emoji_🌈 | emoji_🌌 | emoji_🌏 | emoji_🌪 | emoji_🌸 | emoji_🍀 | emoji_🍁 | emoji_🍂 | emoji_🍃 | ⋯ | emoji_😘 | emoji_😛 | emoji_😜 | emoji_😢 | emoji_😮 | emoji_😲 | emoji_😴 | emoji_🙏 | emoji_🚀 | final_data_new$TSLA.Adjusted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <dbl> | |
1355 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65.47533 |
1356 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 66.11933 |
1357 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 66.93066 |
1358 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 66.72667 |
1359 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 66.28800 |
1360 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 66.78533 |
1361 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 64.05666 |
1362 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65.73200 |
1363 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 63.98267 |
1364 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 67.29000 |
1365 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 71.98734 |
1366 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 74.64200 |
1367 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 80.57733 |
1368 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 91.43867 |
1369 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 92.65733 |
1370 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 91.05867 |
1371 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 92.95200 |
1372 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 102.97667 |
1373 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 99.80400 |
1374 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 101.12000 |
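mtabulate() turns a list of vectors into one row of counts per list element, which is what produces the per-day emoji matrix above. A minimal sketch:

```r
library(qdapTools)

# two "days": one with the fire emoji twice, one with the eyes emoji once
mtabulate(list(c("🔥", "🔥"), "👀"))
# row 1 counts 🔥 = 2 and 👀 = 0; row 2 counts 🔥 = 0 and 👀 = 1
```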
# select first and last column
df_new <- d %>% select(col1=colnames(d)[1], price='final_data_new$TSLA.Adjusted') %>%
filter(col1 != 0) %>%
summarise(avg_price = mean(price))
df_new[1,1]
This code initializes an empty list called price_by_emoji and a vector col_names holding the column names of the emoji_table data frame. It then loops over each column name: for every emoji column it builds a data frame df_new by selecting that column (renamed to col1) together with the TSLA.Adjusted column from d, filters out rows where the emoji count is 0, and calculates the mean of the remaining TSLA.Adjusted values. The result is stored in the price_by_emoji list under the column's name. In other words, this code computes the average stock price on the days each emoji was used.
price_by_emoji = list()
col_names = colnames(emoji_table)
for (col_name in col_names) {
df_new <- d %>% select(col1=col_name, price='final_data_new$TSLA.Adjusted') %>%
filter(col1 != 0) %>%
summarise(avg_price = mean(price))
price_by_emoji[[col_name]] = df_new[1,1]
}
Warning message:
“Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(col_name)
# Now:
data %>% select(all_of(col_name))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.”
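As the tidyselect warning suggests, wrapping the character variable in all_of() avoids the deprecated external-vector selection. A toy version of one loop iteration (d_toy stands in for d):

```r
library(dplyr)

d_toy <- data.frame(a = c(0, 1, 2), price = c(10, 20, 30))
col_name <- "a"

d_toy %>%
  select(col1 = all_of(col_name), price) %>%  # all_of() silences the warning
  filter(col1 != 0) %>%
  summarise(avg_price = mean(price))
# avg_price = 25
```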
df <- data.frame(emoji = names(price_by_emoji), avg_stock_price = unlist(price_by_emoji))
df %>% arrange(desc(avg_stock_price)) %>% select(-1)
avg_stock_price | |
---|---|
<dbl> | |
emoji_👀 | 84.11845 |
emoji_👸 | 52.17200 |
emoji_🔭 | 47.96400 |
emoji_👆 | 46.75467 |
emoji_💯 | 42.75367 |
emoji_🕺 | 42.72067 |
emoji_🏻 | 41.62100 |
emoji_😅 | 40.66311 |
emoji_😮 | 38.20000 |
emoji_💕 | 37.68836 |
emoji_💘 | 36.74433 |
emoji_🍻 | 36.48000 |
emoji_🔥 | 34.06458 |
emoji_🐈 | 33.29817 |
emoji_😢 | 33.23467 |
emoji_👶 | 32.08933 |
emoji_🎶 | 31.43717 |
emoji_😎 | 29.15167 |
emoji_🎄 | 28.72933 |
emoji_👻 | 28.43200 |
emoji_👍 | 28.11459 |
emoji_🎥 | 27.88867 |
emoji_🖤 | 24.82805 |
emoji_🌏 | 24.75355 |
emoji_🎁 | 24.66133 |
emoji_🍜 | 24.38067 |
emoji_😀 | 23.72745 |
emoji_😉 | 23.62442 |
emoji_🌃 | 23.50333 |
emoji_🍒 | 23.32867 |
⋮ | ⋮ |
emoji_🐿 | 19.45400 |
emoji_🎼 | 19.42800 |
emoji_🐉 | 19.25767 |
emoji_👌 | 19.23480 |
emoji_🍃 | 18.89433 |
emoji_🙏 | 18.84578 |
emoji_🚀 | 18.60696 |
emoji_😜 | 18.44567 |
emoji_👽 | 18.43967 |
emoji_🐌 | 18.16400 |
emoji_😲 | 17.86378 |
emoji_🐶 | 17.54933 |
emoji_😇 | 17.52533 |
emoji_😔 | 17.44333 |
emoji_🍁 | 17.24400 |
emoji_🍂 | 17.24400 |
emoji_🐏 | 17.04533 |
emoji_🐐 | 17.00200 |
emoji_🍷 | 16.90267 |
emoji_👁 | 16.44000 |
emoji_🎤 | 16.30200 |
emoji_🕳 | 16.30200 |
emoji_💣 | 16.23267 |
emoji_💦 | 16.23267 |
emoji_🏴 | 15.46333 |
emoji_😋 | 15.18667 |
emoji_🍀 | 15.16900 |
emoji_💰 | 14.79067 |
emoji_😐 | 14.06867 |
emoji_🐣 | 12.34400 |
This code creates a bar chart of the top 40 most frequent two-word combinations (2-grams) in the corpus of tweet text.
First, the tokens() function from the quanteda package builds a Tokens object from the stemmed corpus (stemmed_corpus, created earlier in the notebook). The tokens_ngrams() function then produces an object containing all 2-grams from that Tokens object.
Next, the dfm() function from the quanteda package creates a document-feature matrix (DFM) from the 2-gram tokens. The textstat_frequency() function from quanteda.textstats then returns a data frame of the 2-gram features and their frequencies.
Finally, the top 40 most frequent 2-grams are plotted using the ggplot2 package. The reorder() function orders the 2-grams by frequency, and the geom_col() function draws the bars.
# 2-word frequency (2-gram)
# create tokens object from stemmed corpus
tokens_stemmed <- tokens(stemmed_corpus)
dfm2 <- dfm(tokens_ngrams(tokens_stemmed, n = 2))
dfFreq2 <- textstat_frequency(dfm2)
library(ggplot2)
ggplot(dfFreq2[1:40, ], aes(x = reorder(feature, frequency), y = frequency)) +
geom_col() +
coord_flip() +
scale_x_discrete(name = "2-gram") +
theme(text = element_text(size = 12))
# 2-gram features to data frame
dtm2 <- convert(dfm2, to = "tm")
data_dtm2 <- as.data.frame(as.matrix(dtm2))
head(data_dtm2)
curious_p | p_would | would_can | can_schedul | schedul_test | test_drive | prob_good | good_though | though_give | give_we | ⋯ | heat_technic | technic_will | quit_mph | mph_charg | charg_x | x_especi | especi_much | bigger_hell | hell_ride | cute_wow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
text1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
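As a base-R illustration of what tokens_ngrams(n = 2) produces for a single document, adjacent tokens are simply pasted together with an underscore:

```r
# toy token vector (stemmed words from the first tweet)
toks <- c("can", "schedul", "test", "drive")

# pair each token with its successor, joined by "_"
bigrams <- paste(head(toks, -1), tail(toks, -1), sep = "_")
bigrams  # "can_schedul" "schedul_test" "test_drive"
```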
This code builds a frequency distribution of 3-grams in the text data.
The tokens_ngrams() function from the quanteda package creates 3-grams from the stemmed-corpus tokens.
The dfm() function then builds a document-feature matrix from those tokens.
The textstat_frequency() function produces a frequency table of the 3-grams, stored in dfFreq3.
Finally, ggplot2 draws a bar plot of the top 40 most frequent 3-grams.
# 3-word frequency (3-gram)
# create 3-gram dfm from the stemmed tokens
dfm3 <- dfm(tokens_ngrams(tokens_stemmed, n = 3))
dfFreq3 <- textstat_frequency(dfm3)
ggplot(dfFreq3[1:40, ], aes(x = reorder(feature, frequency), y = frequency)) +
geom_col() +
coord_flip() +
scale_x_discrete(name = "3-gram") +
theme(text = element_text(size = 12))
# 3-gram features to data frame
dtm3 <- convert(dfm3, to = "tm")
data_dtm3 <- as.data.frame(as.matrix(dtm3))
head(data_dtm3)
curious_p_would | p_would_can | would_can_schedul | can_schedul_test | schedul_test_drive | prob_good_though | good_though_give | though_give_we | give_we_time | we_time_replac | ⋯ | technic_will_not | will_not_quit | not_quit_mph | quit_mph_charg | mph_charg_x | charg_x_especi | x_especi_much | especi_much_bigger | much_bigger_hell | bigger_hell_ride | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
text1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text2 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Merging datasets of ngrams
tesla_sentiment = read.csv('/kaggle/input/pre-finance/pre_finance.csv')
ngrams_data = cbind(tesla_sentiment, data_dtm2, data_dtm3)
head(ngrams_data, 2)
X | date | tweet | X.can.. | X.test.. | X... | X.almost.. | X.give.. | X.go.. | X.good.. | ⋯ | technic_will_not | will_not_quit | not_quit_mph | quit_mph_charg | mph_charg_x | charg_x_especi | x_especi_much | especi_much_bigger | much_bigger_hell | bigger_hell_ride | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <chr> | <chr> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
text1 | 1 | 2015-01-30 | If you are curious about the P85D, you can schedule a test drive here: http://ts.la/dE | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text2 | 2 | 2015-02-08 | Prob good though. Will give us time to replace 1st stage video transmitter (not needed for launch, but nice to have). . Air Force tracking radar went down. Launch postponed to same time tomorrow. . Rocket reentry will be much tougher this time around due to deep space mission. Almost 2X force and 4X heat. Plenty of hydraulic fluid tho. . Launching our 1st deep space mission today. Headed to Earth-Sun L1 gravity null point at 1M miles, 4X further than moon. | 0 | 0 | 4 | 1 | 1 | 1 | 1 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text3 | 3 | 2015-02-10 | Launch postponed to tomorrow due to high winds at the Cape, but Dragon still inbound from orbit in 90 mins . Extreme wind shear over Cape Canaveral. Feels like a sledgehammer when supersonic in the vertical. Hoping it changes … . "What Are The Civilian Applications?" https://m.youtube.com/watch?v=M8YjvHYbZ9w … | 0 | 0 | 2 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text4 | 4 | 2015-02-11 | Rocket soft landed in the ocean within 10m of target & nicely vertical! High probability of good droneship landing in non-stormy weather. . Primary mission on target. Spacecraft head towards the sun! All good there. . @DanielLockyer We could actually do that...maybe we should . Planning a significant upgrade of the droneship for future missions to handle literally anything. Maybe give it a Merlin for good measure :) . Can't delay any longer. Must proceed with primary mission to launch the Deep Space Climate Observatory spacecraft. . Mega storm preventing droneship from remaining on station, so rocket will try to land on water. Survival probability <1%. . Coming home pic.twitter.com/FmrmYs6R6V . Dragon splashdown off the California coast pic.twitter.com/4Bvfmei8I3 | 0 | 0 | 2 | 0 | 1 | 0 | 3 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
text5 | 5 | 2015-02-12 | Landing on a stormy sea pic.twitter.com/7EY25g3IU5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
merged_df <- merge(ngrams_data, tsla_prices, by = "date", all.y = TRUE)
# Add increase column: 1 if the adjusted close rose from the previous day
merged_df$increase <- c(0, ifelse(diff(merged_df$TSLA.Adjusted) > 0, 1, 0))
# Replace NAs in the feature columns with 0; note the parentheses around
# ncol(merged_df) - 1, since `:` binds tighter than `-` in R
merged_df[, 3:(ncol(merged_df) - 1)][is.na(merged_df[, 3:(ncol(merged_df) - 1)])] <- 0
tail(merged_df, 2)
date | X | tweet | X.can.. | X.test.. | X... | X.almost.. | X.give.. | X.go.. | X.good.. | ⋯ | not_quit_mph | quit_mph_charg | mph_charg_x | charg_x_especi | x_especi_much | especi_much_bigger | much_bigger_hell | bigger_hell_ride | TSLA.Adjusted | increase | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<date> | <dbl> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1373 | 2020-07-13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 99.804 | 0 |
1374 | 2020-07-14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 101.120 | 1 |
write.csv(merged_df, file="final_data.csv", row.names = F)
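A subtlety in column-range expressions like the NA-replacement above: in R, the `:` operator binds tighter than `-`, so `3:ncol(df) - 1` is parsed as `(3:ncol(df)) - 1`, which shifts the whole range down by one, rather than `3:(ncol(df) - 1)`. A quick illustration:

```r
n <- 10
r1 <- 3:n - 1    # parsed as (3:n) - 1, i.e. 2 3 4 5 6 7 8 9
r2 <- 3:(n - 1)  # 3 4 5 6 7 8 9
```

Explicit parentheses make the intended range unambiguous.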
This code uses the fread() function from data.table to import the data. fread is implemented in C, which makes file import much faster. Here we do not use the last dataframe created during the feature-engineering step, because of limits on computational power: the resulting matrix would be roughly 250 GB.
To work around this, we use the dataset produced before the exploratory analysis, which contains the date, the tf-idf matrix, the sentiment analysis, and the Yahoo Finance data.
library(data.table)
merged_df = fread('/kaggle/input/pre-data-visualization/tesla_sentiment.csv')
head(merged_df, 2)
date | tweet | ""can"", | ""test"", | """", | ""almost"", | ""give"", | ""go"", | ""good"", | ""launch"", | ⋯ | ""pretti"", | ""starship"", | word_scores | compound | pos | neu | neg | but_count | TSLA.Adjusted | increase |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<IDate> | <chr> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <dbl> | <int> |
2015-01-29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0.000 | 0.000 | 0.000 | 0 | 0 | 13.68000 | 0 |
2015-01-30 | If you are curious about the P85D, you can schedule a test drive here: http://ts.la/dE | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | {0, 0, 0, 1.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} | 0.318 | 0.141 | 0.859 | 0 | 0 | 13.57333 | 0 |
This code converts the "date" column in merged_df from a character string in the "YYYY-MM-DD" format to a numeric value using the base R functions as.Date() and as.numeric().
as.Date() parses the character strings into Date objects using the specified format string ("%Y-%m-%d" here) and assigns the result back to the "date" column.
as.numeric() then converts the dates into numeric values representing the number of days since January 1, 1970 (the Unix epoch). The resulting values are stored back into the "date" column of merged_df.
merged_df$date <- as.Date(merged_df$date, format="%Y-%m-%d")
merged_df$date <- as.numeric(merged_df$date)
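The epoch convention can be checked directly in base R:

```r
# one day after the Unix epoch (1970-01-01) should map to 1
d <- as.Date("1970-01-02", format = "%Y-%m-%d")
as.numeric(d)  # 1
```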
This code performs data preprocessing for the time series analysis and classification task.
The first two lines select the "TSLA.Adjusted" column from merged_df and standardize it with the scale() function from the stats package, which makes the values easier to work with in subsequent analyses.
The next two lines select all columns except "date", "TSLA.Adjusted", "increase", "word_scores", and "tweet" and scale them as well, standardizing the features for the classification model.
Finally, the labels variable is created from the "increase" column of merged_df. This column contains binary values indicating whether the stock price increased (1) or not (0) on a given day, and it will serve as the target variable in the classification model.
time_series = merged_df %>% select(TSLA.Adjusted)
time_series = scale(time_series)
nlp_data = merged_df %>% select(-date, -TSLA.Adjusted, -increase, -word_scores, -tweet)
nlp_data = scale(nlp_data)
labels = merged_df$increase
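scale() centers each column to mean 0 and scales it to standard deviation 1, which can be verified on a toy column:

```r
# standardize a single toy column, as done for TSLA.Adjusted above
x <- scale(c(10, 20, 30, 40, 50))
mean(x)  # ~0
sd(x)    # ~1
```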
ncol(time_series)
nrow(time_series)
ncol(nlp_data)
nrow(nlp_data)
This code converts the time series dataframe into a matrix, which is the required input format for the LSTM model.
time_series = as.matrix(time_series)
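array() (used for the reshaping below) fills in column-major order without changing the stored values; a small example shows a matrix viewed as a 3-D array:

```r
m <- matrix(1:6, nrow = 3)        # 3 x 2 matrix
a <- array(m, dim = c(3, 2, 1))   # same values, now a 3-D array
dim(a)        # 3 2 1
a[2, 2, 1]    # equals m[2, 2], i.e. 5
```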
This code trains a deep learning model using Keras to make predictions based on two types of data: time series and NLP (natural language processing) data. The model architecture includes two LSTM layers and a dense layer, and it is trained using binary cross-entropy loss and the Adam optimizer. The model is also evaluated using accuracy as a metric.
The steps in the model are:
1. Determine the number of time steps and features in the time series data, and the number of words and dimensions in the NLP data. Set the number of LSTM units and dense units for the model.
2. Reshape the time series and NLP data to match the input shape of the model, using the array() function.
3. Define the input shapes for the time series and NLP data.
4. Define the model architecture using layer_input(), layer_lstm(), layer_global_max_pooling_1d(), layer_dense(), and layer_concatenate(). This creates a model with two inputs (one for time series data and one for NLP data), two LSTM branches, a dense layer, and a concatenation layer that combines the branch outputs.
5. Compile the model with compile(), which sets the loss function, optimizer, and evaluation metric.
6. Define a callback that saves the best model based on validation accuracy, using callback_model_checkpoint().
7. Train the model with fit(), specifying the number of epochs, batch size, validation split, and verbosity, along with the early-stopping and model-checkpoint callbacks.
8. Evaluate the model with evaluate(), which computes the loss and accuracy on the data.
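The binary cross-entropy loss used below can be computed by hand for a single prediction, which helps when reading the training logs; this standalone helper mirrors the textbook definition:

```r
# binary cross-entropy for one true label y (0/1) and predicted probability p
bce <- function(y, p) -(y * log(p) + (1 - y) * log(1 - p))

bce(1, 0.9)  # confident and correct: small loss (~0.105)
bce(1, 0.1)  # confident but wrong: large loss (~2.303)
```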
library(keras)
max_time_steps <- nrow(time_series)
num_features <- ncol(time_series)
max_words <- nrow(nlp_data)
embedding_dim <- ncol(nlp_data)
lstm_units <- 32
dense_units <- 64
# Reshape time series data to match the input shape of the model
time_series <- array(data = time_series, dim = c(nrow(time_series), max_time_steps, num_features))
# Reshape NLP data to match the input shape of the model
nlp_data <- array(data = nlp_data, dim = c(nrow(nlp_data), max_words, embedding_dim))
# Define input shapes for time series and NLP data
ts_input_shape <- c(max_time_steps, num_features)
nlp_input_shape <- c(max_words, embedding_dim)
# Define LSTM-based model architecture
ts_input <- layer_input(shape = ts_input_shape, name = "ts_input")
ts_lstm <- layer_lstm(units = lstm_units, return_sequences = TRUE)(ts_input)
ts_output <- layer_global_max_pooling_1d()(ts_lstm)
nlp_input <- layer_input(shape = nlp_input_shape, name = "nlp_input")
nlp_lstm <- layer_lstm(units = lstm_units)(nlp_input)
nlp_output <- layer_dense(units = dense_units, activation = "relu")(nlp_lstm)
combined <- layer_concatenate(inputs = list(ts_output, nlp_output), name = "combined")
output <- layer_dense(units = 1, activation = "sigmoid")(combined)
model <- keras_model(inputs = list(ts_input, nlp_input), outputs = output)
# Compile the model
model %>% compile(
loss = "binary_crossentropy",
optimizer = optimizer_adam(),
metrics = c("accuracy")
)
# Define a callback to save the model with the best accuracy
best_model_callback <- callback_model_checkpoint(
filepath = "best_model.h5",
save_best_only = TRUE,
monitor = "val_accuracy",
mode = "max"
)
# Train the model with early stopping and the best model callback
history <- model %>% fit(
x = list(time_series, nlp_data),
y = labels,
epochs = 2000,
batch_size = 32,
validation_split = 0.2,
verbose = 2,
callbacks = list(
callback_early_stopping(patience = 100),
best_model_callback
)
)
# Evaluate the model
scores <- model %>% evaluate(
x = list(time_series, nlp_data),
y = labels,
batch_size = 32
)
# Print the evaluation metrics
cat("Test loss:", scores[[1]], "\n")
cat("Test accuracy:", scores[[2]], "\n")
Test loss: 0.8049979 Test accuracy: 0.7503639
This code is splitting a merged dataset into training and testing sets, and creating corresponding labels for each set.
The split_idx variable is defined as 80% of the number of rows in the merged_df dataframe, and the dplyr library is loaded for later use.
The time_series_train variable is created by selecting the first split_idx rows of the time_series matrix, and converting it to a matrix using the as.matrix() function. The time_series_test variable is created by selecting the remaining rows of the time_series matrix, and also converting it to a matrix using the as.matrix() function. Similarly, nlp_data_train and nlp_data_test are created by selecting the corresponding rows from nlp_data.
The train_labels variable is created by selecting the increase column from the first split_idx rows of the merged_df dataframe. Similarly, test_labels are created by selecting the increase column from the remaining rows of the merged_df dataframe.
split_idx <- round(nrow(merged_df) * 0.8)
library(dplyr)
time_series_train <- time_series[1:split_idx, ]
time_series_train <- as.matrix(time_series_train)
time_series_test <- time_series[(split_idx + 1):nrow(time_series), ]
time_series_test <- as.matrix(time_series_test)
nlp_data_train <- nlp_data[1:split_idx, ]
nlp_data_test <- nlp_data[(split_idx + 1):nrow(nlp_data), ]
train_labels <- merged_df[1:split_idx, ]$increase
test_labels <- merged_df[(split_idx + 1):nrow(merged_df), ]$increase
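The index-based split preserves chronological order (unlike random sampling), which matters for time series data; the same pattern on a toy vector:

```r
# chronological 80/20 split on a toy series
x <- 1:10
split_idx <- round(length(x) * 0.8)
train <- x[1:split_idx]               # everything before the cut
test  <- x[(split_idx + 1):length(x)] # only later observations
train  # 1 2 3 4 5 6 7 8
test   # 9 10
```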
This code defines an LSTM-based neural network model for the combined time series and NLP (natural language processing) dataset. Here is a brief summary of the steps:
1. Define the input shapes, model architecture, and hyperparameters: LSTM units, dense units, dropout rate, and learning rate.
2. Reshape the input data to match the expected input shape of the model.
3. Define the Nadam optimizer with a specific learning rate, and early stopping based on validation accuracy.
4. Compile the model with binary cross-entropy loss, the Nadam optimizer, and an accuracy metric.
5. Train the model with callbacks that save the best model and stop training if validation accuracy stops improving, using a batch size of 32 for up to 2000 epochs.
The architecture consists of two LSTM branches, one for the time series data and one for the NLP data, followed by dropout layers to prevent overfitting, dense layers with a rectified linear unit (ReLU) activation function, and a concatenation layer that merges the two branch outputs. A final dense layer with sigmoid activation produces the binary classification output.
library(keras)
max_time_steps <- ncol(time_series_train)
num_features <- nrow(time_series_train)
max_words <- ncol(nlp_data_train)
embedding_dim <- nrow(nlp_data_train)
lstm_units <- 32
dense_units <- 64
dropout_rate <- 0.1
learning_rate <- 0.01
# Reshape time series data to match the input shape of the model
time_series_train <- array(data = time_series_train, dim = c(nrow(time_series_train), max_time_steps, num_features))
time_series_test <- array(data = time_series_test, dim = c(nrow(time_series_test), max_time_steps, num_features))
# Reshape NLP data to match the input shape of the model
nlp_data_train <- array(data = nlp_data_train, dim = c(nrow(nlp_data_train), max_words, embedding_dim))
nlp_data_test <- array(data = nlp_data_test, dim = c(nrow(nlp_data_test), max_words, embedding_dim))
# Define input shapes for time series and NLP data
ts_input_shape <- c(max_time_steps, num_features)
nlp_input_shape <- c(max_words, embedding_dim)
# Define LSTM-based model architecture
ts_input <- layer_input(shape = ts_input_shape, name = "ts_input")
ts_lstm <- layer_lstm(units = lstm_units, return_sequences = TRUE)(ts_input)
ts_dropout <- layer_dropout(rate = dropout_rate)(ts_lstm)
ts_output <- layer_global_max_pooling_1d()(ts_dropout)
nlp_input <- layer_input(shape = nlp_input_shape, name = "nlp_input")
nlp_lstm <- layer_lstm(units = lstm_units)(nlp_input)
nlp_dropout <- layer_dropout(rate = dropout_rate)(nlp_lstm)
nlp_output <- layer_dense(units = dense_units, activation = "relu")(nlp_dropout)
combined <- layer_concatenate(inputs = list(ts_output, nlp_output), name = "combined")
output <- layer_dense(units = 1, activation = "sigmoid")(combined)
model <- keras_model(inputs = list(ts_input, nlp_input), outputs = output)
# Define the Nadam optimizer with a specific learning rate
optimizer <- optimizer_nadam(learning_rate = learning_rate)
# Define early stopping based on validation accuracy
early_stopping <- callback_early_stopping(
patience = 100,
monitor = "val_accuracy",
mode = "max"
)
# Compile the model
model %>% compile(
loss = "binary_crossentropy",
optimizer = optimizer,
metrics = c("accuracy")
)
# Train the model
history <- model %>% fit(
x = list(time_series_train, nlp_data_train),
y = train_labels,
epochs = 2000,
batch_size = 32,
validation_data = list(list(time_series_test, nlp_data_test), test_labels),
callbacks = list(
callback_model_checkpoint("best_model_advanced.h5", save_best_only = TRUE, verbose = 1),
early_stopping
)
)
# Save the best model as h5 file
save_model_hdf5(model, "best_model_advanced.h5")
Move the model from the input directory to the output directory in Kaggle for further reuse.
# Create output directory if it doesn't exist
dir.create("output")
# Copy model file from input directory to output directory
file.copy(from = "/kaggle/input/model4/best_model6.h5",
to = "/kaggle/working/output/best_model6.h5")
This code loads a saved Keras model from an HDF5 file, evaluates the model's performance on the test data, and prints the test loss and accuracy.
First, the load_model_hdf5() function from the Keras package is used to load the saved model from the file "best_model6.h5" into the model object.
Next, the evaluate() function is used to evaluate the model's performance on the test data. The input data and labels are provided as lists (list(time_series_test, nlp_data_test) and test_labels, respectively), and the batch size is set to 32.
Finally, the test loss and accuracy are printed using cat() and paste0() functions. The test loss is obtained from the first element of the test_loss_and_metrics object, and the test accuracy is obtained from the second element.
# Load the best saved model
library('keras')
model <- load_model_hdf5("/kaggle/working/best_model_advanced.h5")
# Evaluate the model on the test data
test_loss_and_metrics <- model %>% evaluate(
x = list(time_series_test, nlp_data_test),
y = test_labels,
batch_size = 32
)
# Print the test loss and accuracy
cat(paste0("Test loss: ", test_loss_and_metrics[1], "\n"))
cat(paste0("Test accuracy: ", test_loss_and_metrics[2], "\n"))
Test loss: 0.684055924415588 Test accuracy: 0.556363642215729
As we can observe from the two accuracies, the first and simpler model performs substantially better than the more complex one: 75% vs. 56% accuracy. The more complex model adds layers, pooling, and dropout. Moreover, it splits the dataset into train and test sets using an index as a slicer, so that the split respects the chronological order required for time series data. The first model lacks these safeguards, and its validation data is sampled randomly from the dataset. This suggests two conclusions: